/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: December 2024

Inputs: 	pdb_full_structures.dta
			pdb_full_papers.dta
			pdb_full_entities.dta	(listed, but no section of this file loads it)
			pdb_analysis.dta
			phat.dta
		
Outputs: 	Figures E2, E4
			Table E1

Purpose: Appendix analyses that don't naturally fit in other files.

*******************************************************************************/

	
/*------------------------------------------------------------------------------

	Figure E2: Predicting Single-Structure Projects

------------------------------------------------------------------------------*/

* Assemble data
clear all
use "${data_built}pdb_full_structures.dta"

* Restrict to structures where we know how many structures were in the paper.
* missing() also catches extended missing values (.a-.z), which a comparison
* against the system missing value alone does not.
drop if missing(structuresInPaper)

* Indicators for MORE THAN ONE structure in the paper / in the project
* (0 = single-structure, 1 = multi-structure).
* Stata ranks missing above every number, so an unguarded
* (structuresInProject > 1) would code structures with an unknown project
* size as multi-structure and count them as Type II errors in Panel B.
* Leaving the indicator missing makes graph bar drop those observations.
gen byte multiStructurePaper = (structuresInPaper > 1)
gen byte multiStructureProject = (structuresInProject > 1) 					///
	if !missing(structuresInProject)

* Constant used so that graph bar (percent) reports the share of observations
gen count = 1

* Panel A: among structures in projects we predict are single-structure,
* the share whose paper is single- vs. multi-structure
graph bar (percent) count if multiStructureProject == 0, 					///
	over(multiStructurePaper,  												///
	relabel(1 `" "Single-structure paper" "(Correctly predicted)" "' 		///
	2 `" "Multi-structure paper" "(Type I error)" "')) 						///
	ytitle("Percent of single-structure projects") ylabel(0(20)100) 		///
	bar(1,color(navy%30)) name(graph1, replace) 							///
	title("Panel A: Predicted Single-Structure Papers")

* Panel B: among structures in papers that actually are single-structure,
* the share whose project is predicted single- vs. multi-structure
graph bar (percent) count if multiStructurePaper == 0, 						///
	over(multiStructureProject, 											///
	relabel(1 `" "Single-structure project" "(Correctly predicted)" "' 		///
	2 `" "Multi-structure project" "(Type II error)" "')) 					///
	ytitle("Percent of single-structure papers") ylabel(0(20)100) 			///
	bar(1,color(navy%30)) name(graph2, replace)								///
	title("Panel B: Actual Single-Structure Papers")

* Combine and save (graph save / graph export act on the combined graph,
* which is the current graph after graph combine)
graph combine graph1 graph2
graph save "${figures}figureE2.gph", replace
graph export "${figures}figureE2.pdf", replace
	
	
/*------------------------------------------------------------------------------

	Figure E4: LASSO Validation

------------------------------------------------------------------------------*/

* Build the sample: start from the full structure file and keep only the
* structures that also appear in the analysis file. The merge fails if the
* analysis file contains a structure that the full file does not.
clear all
use "${data_built}pdb_full_structures.dta"
merge 1:1 structureId using "${data_built}pdb_analysis.dta", 				///
	assert(master match) keep(match) nogenerate

* Attach paper-level variables; papers with no structure in the sample
* are discarded, structures without a matching paper are retained
merge m:1 pubmedId using "${data_built}pdb_full_papers.dta", 				///
	keep(master match) nogenerate

* Attach the contents of phat.dta; every structure must match in both
* directions or the merge fails
merge 1:1 structureId using "${data_built}phat.dta", 						///
	assert(match) nogenerate

* Panel A: overlaid histograms of ptileCites3 (shaded bars, labelled Actual)
* and predPtileCites3 (black outline, labelled Predicted)
twoway 																		///
	(histogram ptileCites3, start(0) width(6.67) 							///
		fcolor(navy%30) lcolor(navy%30)) 									///
	(histogram predPtileCites3, start(0) width(6.67) 						///
		fcolor(none) lcolor(black)), 										///
	name(hist) 																///
	title("Panel A: Histogram") 											///
	xtitle("Three-year citation percentile") 								///
	ytitle("Density") 														///
	legend(order(1 "Actual" 2 "Predicted") position(6) row(1))

* Panel B: binned scatterplot of ptileCites3 against predPtileCites3
binscatter ptileCites3 predPtileCites3, 									///
	name(binscatter) 														///
	title("Panel B: Binned scatterplot") 									///
	xtitle("Predicted three-year citation percentile") 						///
	ytitle("Actual three-year citation percentile") 						///
	xlabel(20(20)80) ylabel(20(20)80) 										///
	mcolors(navy%30) lcolors(navy)

* Put the two panels side by side, then write the combined graph to disk
graph combine hist binscatter, xsize(4) ysize(2)
graph save "${figures}figureE4.gph", replace
graph export "${figures}figureE4.pdf", replace

/*------------------------------------------------------------------------------

	Table E1: Correlation Between Quality Outcomes

------------------------------------------------------------------------------*/

* Build the sample: start from the full structure file and keep only the
* structures that also appear in the analysis file. The merge fails if the
* analysis file contains a structure that the full file does not.
clear all
use "${data_built}pdb_full_structures.dta"
merge 1:1 structureId using "${data_built}pdb_analysis.dta", 				///
	assert(master match) keep(match) nogenerate

* Attach paper-level variables; papers with no structure in the sample
* are discarded, structures without a matching paper are retained
merge m:1 pubmedId using "${data_built}pdb_full_papers.dta", 				///
	keep(master match) nogenerate

* Attach the contents of phat.dta; every structure must match in both
* directions or the merge fails
merge 1:1 structureId using "${data_built}phat.dta", 						///
	assert(match) nogenerate

* Pairwise correlations of the three outcome variables, stored as a matrix
correlate refinementResolution rFree ramaRaw
matrix Q_correlations = r(C)

* Write the matrix into the workbook: turn it into a temporary dataset
* (one variable per column), export the values starting at cell D60 of
* sheet tableE1, then bring the analysis data back into memory
preserve
	clear
	svmat Q_correlations
	export excel "${tables}/Tables.xlsx", sheet(tableE1) cell(D60) sheetmodify
restore	
	



